In [ ]:
!pip install lxml
!pip install BeautifulSoup4
In [ ]:
import urllib.request
from lxml import html
from bs4 import BeautifulSoup
In [ ]:
# Scrape all HTML from webpage.
def scrapewebpage(url):
    """Download ``url`` and return the raw response body.

    Args:
        url: Address of the page to download.

    Returns:
        The unparsed body exactly as returned by ``read()`` when the
        response status is 200. For any other status an error message is
        printed and ``None`` is returned.

    Exceptions raised by ``urllib.request.urlopen`` itself are not caught
    here and propagate to the caller.
    """
    # The context manager closes the response once we are done with it.
    with urllib.request.urlopen(url) as web:
        status = web.getcode()
        # Make sure there weren't any errors opening the URL.
        if status == 200:
            return web.read()
    # Both values must be passed together as a tuple: "%" binds tighter than
    # a comma, so a bare `fmt % a, b` formats with only one argument and fails.
    print("Error %s reading %s" % (status, url))
    return None
# Helper function that scrapes a webpage and turns it into soup.
def makesoup(url):
    """Fetch ``url`` with scrapewebpage and parse the result with the lxml parser.

    Returns the BeautifulSoup object built from the downloaded markup.
    """
    return BeautifulSoup(scrapewebpage(url), "lxml")
In [ ]:
# Download and parse the IMDb page for Interstellar (2014) using the
# "makesoup" helper defined above. The parsed page is kept in movie_soup.
movie_soup = makesoup('http://www.imdb.com/title/tt0816692/')
In [ ]:
# Pull the individual facts out of the parsed movie page (movie_soup).

# Movie title, with surrounding whitespace removed.
title = movie_soup.find(itemprop="name").get_text().strip()

# Movie year: drop the parentheses, turning "(2014)" into "2014".
year = movie_soup.find(id="titleYear").get_text()[1:5]

# Movie duration, with surrounding whitespace removed.
duration = movie_soup.find(itemprop="duration").get_text().strip()

# Director, with surrounding whitespace removed.
director = movie_soup.find(itemprop="director").get_text().strip()

# Movie rating, kept exactly as it appears in the page.
rating = movie_soup.find(itemprop="ratingValue").get_text()

# Cast list: every itemprop="actor" element inside each "cast_list" table.
actors = [
    actor.get_text().strip()
    for castlist in movie_soup.find_all("table", "cast_list")
    for actor in castlist.find_all(itemprop="actor")
]
In [ ]:
# Present the results, one labelled line per fact.
for label, value in (
    ("Movie: ", title),
    ("Year: ", year),
    ("Director: ", director),
    ("Duration: ", duration),
    ("Rating: ", rating),
):
    print(label + value)

# Present the list of actors after a blank separator line.
print()
print("Main actors:")
for name in actors:
    print("- " + name)
In [ ]:
# Download and parse the Washington Post front page; the cells below read from wpost_soup.
wpost_soup = makesoup("http://www.washingtonpost.com/")
In [ ]:
# Collect every <div> with the CSS class "headline" and report how many were found.
headlines = wpost_soup.find_all("div", class_="headline")
print("".join(["Found ", str(len(headlines)), " headlines"]))
In [ ]:
# Print the text of each headline, trimmed of surrounding whitespace.
for headline in headlines:
    text = headline.get_text()
    print(text.strip())
In [ ]:
# Print the text and the href of every link found inside each headline,
# with a blank line after each link.
for headline in headlines:
    for anchor in headline.find_all("a"):
        print(anchor.get_text(), anchor.get("href"), sep="\n")
        print()
In [ ]:
# Print every link on the page whose href starts with "http".
for anchor in wpost_soup.find_all("a"):
    href = anchor.get("href")
    # Skip anchors without an href and hrefs that do not begin with "http".
    if href is None or not href.startswith("http"):
        continue
    print(href)
In [ ]:
# Download and parse the Wikipedia article on the Parliamentary Assembly of the Council of Europe.
wiki_soup = makesoup("https://en.wikipedia.org/wiki/Parliamentary_Assembly_of_the_Council_of_Europe")
In [ ]:
# Let's find the table "Composition by parliamentary delegation".
# The table doesn't have a unique name, which makes it difficult to scrape.
# However, it's the first table on the page, so we can use find(), which
# returns only the first match.
table = wiki_soup.find("table")
In [ ]:
# Walk the table row by row and print each row that holds exactly three <td> cells.
for row in table.find_all("tr"):
    cell = row.find_all("td")
    # Rows with any other number of <td> cells are skipped.
    if len(cell) != 3:
        continue
    # Extract the text from the three cells.
    country, seats, accessiondate = (td.get_text() for td in cell)
    print("".join((country, ": ", seats, " seats (", accessiondate, ")")))
In [ ]:
# Modify this to your favorite movie: replace the URL below with the
# address of that movie's IMDb page.
soup = makesoup('http://www.imdb.com/title/tt0816692/')
In [ ]:
# Get rating count instead of name.
# As written, this still reads the itemprop="name" element and removes the
# whitespace before and after its text; change the lookup to fetch the
# rating count.
title = soup.find(itemprop="name").get_text().strip()